library(tidyverse)
library(lubridate)
library(readr) 
library("ggplot2") 
library("dplyr")
library(xts)
library("lubridate")
library("RColorBrewer")
library("ggthemes")
library("gridExtra")
library("leaflet")
library("highcharter")
library(scales)
library(leaflet.extras)
rats_raw <- read.csv("./Rat_Sightings.csv", na = c("", "NA", "N/A", "Unspecified")) %>%
  janitor::clean_names() %>% 
  mutate(created_date = mdy_hms(created_date)) %>%
  mutate(sighting_year = year(created_date),
         sighting_month_num = month(created_date),
         sighting_month = month(created_date, label = TRUE, abbr = FALSE),
         sighting_day = day(created_date),
         sighting_weekday = wday(created_date, label = TRUE, abbr = FALSE)) 

rats_raw

Overall trend in rats

overall <- rats_raw %>% 
  group_by(sighting_year, sighting_month_num, sighting_day) %>% 
  summarize(count = n()) %>% 
  mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-")))
## `summarise()` has grouped output by 'sighting_year', 'sighting_month_num'. You can override using
## the `.groups` argument.
time_series = xts(overall$count , order.by= overall$date)

hchart(time_series, name = "Rat Sightings") %>% 
  hc_add_theme(hc_theme_darkunica()) %>%
  hc_credits(enabled = TRUE, text = "Sources: City of New York", style = list(fontSize = "12px")) %>%
  hc_title(text = "Time Series of NYC Rat Sightings") %>%
  hc_legend(enabled = TRUE)

There seems to be an increasing trend of rat sightings in NYC from 2009 to 2023.

Trend in rats by borough over time

borough_over_time <- rats_raw %>% 
  group_by(sighting_year, sighting_month_num, sighting_day, borough) %>% 
  summarize(count = n()) %>% 
  mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-"))) %>% 
  filter(!is.na(borough))
## `summarise()` has grouped output by 'sighting_year', 'sighting_month_num', 'sighting_day'. You
## can override using the `.groups` argument.
ggplot(borough_over_time, aes(x = date, y = count, color = borough)) +
  geom_line() +
  labs(title = "Entries Over Time by Borough",
       x = "Year",
       y = "Count") +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  theme_minimal()

From this graph, we can see that Brooklyn seems to have the most rat sightings in NYC. However, we are unable to see the trend in rat sightings in the Bronx.

borough_over_time_wo_brooklyn <- rats_raw %>% 
  filter(borough != "BROOKLYN") %>% 
  filter(borough != "MANHATTAN") %>% 
  group_by(sighting_year, sighting_month_num, sighting_day, borough) %>% 
  summarize(count = n()) %>% 
  mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-"))) %>% 
  filter(!is.na(borough))
## `summarise()` has grouped output by 'sighting_year', 'sighting_month_num', 'sighting_day'. You
## can override using the `.groups` argument.
borough_over_time_wo_brooklyn
ggplot(borough_over_time_wo_brooklyn, aes(x = date, y = count, color = borough)) +
  geom_line() +
  labs(title = "Entries Over Time by Borough",
       x = "Year",
       y = "Count") +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  theme_minimal()

It seems like the rat sightings in Bronx and Queens are comparable but further EDA is needed to see where the Bronx stands in terms of rat sightings.

Count of rats by year

by_year <- rats_raw %>% 
  group_by(sighting_year) %>% 
  count() %>% 
  ggplot(aes(x = sighting_year, y = n, fill = n)) + 
  geom_histogram(stat = "identity", position = "dodge") +
  theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 12)) +
  xlab("Year") + 
  ylab("Count") +
  geom_text(aes(label = n), vjust = -0.1, size = 3.75) +
  ggtitle('Count of Rat Sightings through the Years') + 
  scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral'))) 
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring
## unknown parameters: `binwidth`, `bins`, and `pad`
by_year

There was a huge jump in total number of rat sightings from 2020 to 2021.

Counts of rats by month

by_month <- rats_raw %>% 
  group_by(sighting_month) %>% 
  count() %>% 
  ggplot(aes(x = sighting_month, y = n, fill = n)) + 
  geom_histogram(stat = "identity", position = "dodge") +
  theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 9)) +
  xlab("Month") + 
  ylab("Count") +
  geom_text(aes(label = n), vjust = -0.1, size = 3.75) +
  ggtitle('Count of Rat Sightings by Month') + 
  scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral'))) 
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring
## unknown parameters: `binwidth`, `bins`, and `pad`
by_month

It seems like there are the most rat sightings in the warmer months with the peak being at July.

Counts of rat by day of the week

by_day <- rats_raw %>% 
  group_by(sighting_weekday) %>% 
  count() %>% 
  ggplot(aes(x = sighting_weekday, y = n, fill = n)) + 
  geom_histogram(stat = "identity", position = "dodge") +
  theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 12)) +
  xlab("Weekday") + 
  ylab("Count") +
  geom_text(aes(label = n), vjust = -0.1, size = 4) +
  ggtitle('Count of Rat Sightings by Day of Week') + 
  scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral'))) 
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring
## unknown parameters: `binwidth`, `bins`, and `pad`
by_day

There are more rat sighings reported during the weekdays compared to weekends.

by_location_type <- rats_raw %>% 
  group_by(location_type) %>% 
  filter(location_type != "Other (Explain Below)") %>% 
  count() %>% 
  arrange(desc(n)) %>%
  head(30)

by_location_type
ggplot(by_location_type, aes(x = location_type, y = n)) +
  geom_histogram(stat = "identity", position = "dodge") +
  labs(title = "Sightings by Location Type",
       x = "Location Type",
       y = "Count") +
  theme_minimal() + 
  coord_flip() + 
  geom_text(aes(label = n), hjust = -0.01, size = 3) 
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring
## unknown parameters: `binwidth`, `bins`, and `pad`

## Overall Sightings Map and Heat Map

top = 40.917577 # north lat
left = -74.259090 # west long
right = -73.700272 # east long
bottom =  40.477399 # south lat


nyc = rats_raw %>%
  filter(latitude >= bottom) %>%
  filter ( latitude <= top) %>%
  filter( longitude >= left ) %>%
  filter(longitude <= right)

center_lon = median(nyc$longitude,na.rm = TRUE)
center_lat = median(nyc$latitude,na.rm = TRUE)

count = nyc %>%
  group_by(location) %>%
  count()

count 
nyc = merge(nyc, count, by = "location")

factpal = colorFactor("blue", nyc$n)

nyc %>%
leaflet() %>% 
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude)  %>%
  setView(lng=center_lon, lat=center_lat,zoom = 10) 
nyc %>%
  leaflet() %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addHeatmap(lng = ~longitude, lat = ~latitude, intensity = ~(nyc$n), blur = 20, max = 0.05, radius = 15) %>%
  setView(lng=center_lon, lat=center_lat,zoom = 10)